The purpose of this document is to put together cluster analysis of the BGT Resume data for nonveterans and veterans.
# Here we create the function to cluster sequences. We can input the sequence object to cluster, one of three sequence distance options (coming from the testing_cluster_combos top performers) and the number of clusters
cluster <- function(name, object, sequence_option = "option 1", cluster_number) {
transition_matrix <- seqtrate(object, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
if (sequence_option == "option 1"){
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing <- seqdist(object, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
}
if (sequence_option == "option 2"){
# OPTION 2 : OUR OM
sequencing <- seqdist(object, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
}
if (sequence_option == "option 3"){
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing <- seqdist(object, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
}
clusterpam <- wcKMedoids(sequencing, k = cluster_number)
#clusterpam$stats
#clusterpam <- clusterpam$clustering
#clusterpam <- factor(clusterpam)
cluster <- list(sequencing, clusterpam)
names(cluster) <- c("diss", "cluster")
assign(name, cluster, envir = .GlobalEnv)
}
# Here we create the function to visualize most of the covariates
cluster_membership_visualization <- function(variable) {
all_df <- all_df %>% select(-skill, -major) %>% distinct()
overall <- all_df %>% filter(!is.na({{ variable }})) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n()) %>% group_by({{ variable }}) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% distinct({{ variable }}, overall_covariate_percent) %>% mutate(cluster_label = "overall", cluster_group = "overall", overall_cluster_percent = .1) %>% rename(cluster_covariate_percent = overall_covariate_percent)
all_df %>% filter(!is.na({{ variable }})) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n(), overall_cluster_percent = n()/total) %>% group_by({{ variable }}) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% group_by(cluster, {{ variable }}) %>% mutate(cluster_covariate_percent = round((n()/overall_cluster_count * 100), 2)) %>% distinct(cluster, cluster_label, cluster_group, {{ variable }}, cluster_covariate_percent, overall_cluster_percent) %>% rbind(overall) %>%
ggplot(aes(cluster_covariate_percent, cluster_label, fill = factor({{ variable }}))) +
geom_col(aes(width = overall_cluster_percent*10)) +
facet_wrap(~ factor(cluster_group, levels = c("overall", "education", "promotion", "no change", "demotion", "unemployment")), ncol = 1, scales="free")
}
# Here we create the function to visualize the skill and major covariates
cluster_membership_visualization_skillmajor <- function(variable) {
df <- all_df %>%
filter(!is.na({{ variable }})) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, {{variable}}) %>%
group_by({{ variable }}) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, {{ variable }}) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct({{ variable }}, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by({{ variable }}) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na({{ variable }}))
df <- as.data.frame(df)
ggplot(df, aes(cluster_label, {{ variable }}, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-1, 1), oob = squish, name = "Factor of difference \n of variable percent in \n cluster to variable \n percent overall", breaks = c(-1, 0, 1), labels = c("-2x or less", "0", "2x or more"))+
theme_classic()+
#labs(title = paste0("Cluster membership by ", as.character({{ variable }}))) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")
}
# Here we create the clusters for both veterans and nonveterans using a function from the master_functions script
# We aren't building in any years of experience for nonveterans here
# These sequence objects are what is used in the test of cluster quality metrics
# NO YEARS EXPERIENCE ------------------------------------------------------------------------
# Creating sequence objects fo 10, 20, and 30 years
seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE, 0)
seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE, 0)
seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE, 0)
# The sequence option and the number of clusters are chosen from the cluster quality metric test
# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)
Final clustering: - Veterans and nonveterans and all - 10, 20, 30 years
HIGH PERFORMERS: 10 YEARS: OPT 3 19-20, OPT 2 5, OPT 1 19-20 20 YEARS: OPT 2 6, OPT 2 19-20 30 YEARS: OPT 2 5, OPT 2 19-20
# Here we test the quality of clusters to see what combination of sequence_option and number of clusters performs the best for four cluster quality metrics
# A more in depth description of this process is explained in 19_testing_cluster_combos
#seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE)
#seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE)
#seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE)
## TEN YEARS --------------
# Creating transition and cost matrix for sequence object
transition_matrix <- seqtrate(seq.10, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 396 54 21 21 8 23
## [2 ->] 37 16088 872 1001 187 421
## [3 ->] 12 572 20513 1578 344 510
## [4 ->] 11 669 1010 44689 1035 1141
## [5 ->] 6 124 282 917 10606 327
## [6 ->] 16 511 734 1747 616 25063
## [Civilian unemployment ->] 35 677 663 1219 373 528
## [-> Civilian unemployment]
## [1 ->] 35
## [2 ->] 798
## [3 ->] 763
## [4 ->] 1271
## [5 ->] 362
## [6 ->] 2085
## [Civilian unemployment ->] 9707
diag(transition_matrix) = 0
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 0 54 21 21 8 23
## [2 ->] 37 0 872 1001 187 421
## [3 ->] 12 572 0 1578 344 510
## [4 ->] 11 669 1010 0 1035 1141
## [5 ->] 6 124 282 917 0 327
## [6 ->] 16 511 734 1747 616 0
## [Civilian unemployment ->] 35 677 663 1219 373 528
## [-> Civilian unemployment]
## [1 ->] 35
## [2 ->] 798
## [3 ->] 763
## [4 ->] 1271
## [5 ->] 362
## [6 ->] 2085
## [Civilian unemployment ->] 0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 2.00 1.67 1.87 1.87 1.95 1.86
## [2 ->] 1.99 2.00 1.74 1.70 1.94 1.87
## [3 ->] 2.00 1.85 2.00 1.58 1.91 1.87
## [4 ->] 2.00 1.87 1.80 2.00 1.80 1.78
## [5 ->] 2.00 1.94 1.86 1.55 2.00 1.84
## [6 ->] 2.00 1.91 1.87 1.69 1.89 2.00
## [Civilian unemployment ->] 2.00 2.00 2.00 2.00 2.00 2.00
## [-> Civilian unemployment]
## [1 ->] 1.78
## [2 ->] 1.76
## [3 ->] 1.80
## [4 ->] 1.75
## [5 ->] 1.82
## [6 ->] 1.63
## [Civilian unemployment ->] 2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
# Creating distance matricies for the three distance options
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.10, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.10, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.10, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
# Beginning the clustering, we are only using PAM clustering so we are varying the number of clusters and the distance option here
# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt1, k = number)
assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt2, k = number)
assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt3, k = number)
assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)
# Here we are getting the 4 quality metrics for each solution to compare them
# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)
rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)
# Here we visualize the results comparing each cluster solution
PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
# labs(title = "10 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "HC value", type = "viridis") +
# labs(title = "10 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "PBC value", type = "viridis") +
# labs(title = "10 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "R^2 value", type = "viridis") +
# labs(title = "10 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw() +
coord_cartesian(clip = 'off') +
theme(plot.margin = unit(c(1,3,1,1), "lines"))
## TWENTY YEARS ---------------------
transition_matrix <- seqtrate(seq.20, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 192 15 8 10 3 6
## [2 ->] 7 12040 470 662 116 176
## [3 ->] 6 364 17773 1119 223 311
## [4 ->] 8 505 771 48192 989 835
## [5 ->] 0 83 185 921 11503 251
## [6 ->] 4 198 422 1318 452 15065
## [Civilian unemployment ->] 18 515 643 1367 423 455
## [-> Civilian unemployment]
## [1 ->] 15
## [2 ->] 467
## [3 ->] 592
## [4 ->] 1099
## [5 ->] 306
## [6 ->] 1561
## [Civilian unemployment ->] 13547
diag(transition_matrix) = 0
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 0 15 8 10 3 6
## [2 ->] 7 0 470 662 116 176
## [3 ->] 6 364 0 1119 223 311
## [4 ->] 8 505 771 0 989 835
## [5 ->] 0 83 185 921 0 251
## [6 ->] 4 198 422 1318 452 0
## [Civilian unemployment ->] 18 515 643 1367 423 455
## [-> Civilian unemployment]
## [1 ->] 15
## [2 ->] 467
## [3 ->] 592
## [4 ->] 1099
## [5 ->] 306
## [6 ->] 1561
## [Civilian unemployment ->] 0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 2 1.74 1.86 1.82 1.95 1.89
## [2 ->] 2 2.00 1.75 1.65 1.94 1.91
## [3 ->] 2 1.86 2.00 1.57 1.91 1.88
## [4 ->] 2 1.88 1.82 2.00 1.76 1.80
## [5 ->] 2 1.95 1.89 1.47 2.00 1.86
## [6 ->] 2 1.95 1.89 1.67 1.89 2.00
## [Civilian unemployment ->] 2 2.00 2.00 2.00 2.00 2.00
## [-> Civilian unemployment]
## [1 ->] 1.74
## [2 ->] 1.75
## [3 ->] 1.77
## [4 ->] 1.74
## [5 ->] 1.82
## [6 ->] 1.61
## [Civilian unemployment ->] 2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.20, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.20, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.20, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt1, k = number)
assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt2, k = number)
assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt3, k = number)
assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)
# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)
rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)
PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
# labs(title = "20 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "HC value", type = "viridis") +
# labs(title = "20 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "PBC value", type = "viridis") +
# labs(title = "20 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "R^2 value", type = "viridis") +
# labs(title = "20 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
## THIRTY YEARS -----------------
transition_matrix <- seqtrate(seq.30, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 97 6 2 4 2 3
## [2 ->] 4 5211 202 248 41 55
## [3 ->] 4 157 7945 419 101 128
## [4 ->] 1 213 298 22431 476 326
## [5 ->] 1 38 74 420 6484 118
## [6 ->] 2 47 176 520 186 6299
## [Civilian unemployment ->] 3 267 307 672 221 234
## [-> Civilian unemployment]
## [1 ->] 4
## [2 ->] 185
## [3 ->] 260
## [4 ->] 493
## [5 ->] 140
## [6 ->] 781
## [Civilian unemployment ->] 9118
diag(transition_matrix) = 0
round(transition_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 0 6 2 4 2 3
## [2 ->] 4 0 202 248 41 55
## [3 ->] 4 157 0 419 101 128
## [4 ->] 1 213 298 0 476 326
## [5 ->] 1 38 74 420 0 118
## [6 ->] 2 47 176 520 186 0
## [Civilian unemployment ->] 3 267 307 672 221 234
## [-> Civilian unemployment]
## [1 ->] 4
## [2 ->] 185
## [3 ->] 260
## [4 ->] 493
## [5 ->] 140
## [6 ->] 781
## [Civilian unemployment ->] 0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
## [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->] 2.00 1.71 1.90 1.81 1.90 1.86
## [2 ->] 1.99 2.00 1.73 1.66 1.94 1.93
## [3 ->] 2.00 1.85 2.00 1.61 1.91 1.88
## [4 ->] 2.00 1.88 1.84 2.00 1.74 1.82
## [5 ->] 2.00 1.95 1.91 1.47 2.00 1.85
## [6 ->] 2.00 1.97 1.90 1.70 1.89 2.00
## [Civilian unemployment ->] 2.00 2.00 2.00 2.00 2.00 2.00
## [-> Civilian unemployment]
## [1 ->] 1.81
## [2 ->] 1.75
## [3 ->] 1.76
## [4 ->] 1.73
## [5 ->] 1.82
## [6 ->] 1.54
## [Civilian unemployment ->] 2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.30, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.30, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.30, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt1, k = number)
assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt2, k = number)
assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt3, k = number)
assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)
# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)
rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)
PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
# labs(title = "30 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "HC value", type = "viridis") +
# labs(title = "30 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "PBC value", type = "viridis") +
# labs(title = "30 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "R^2 value", type = "viridis") +
# labs(title = "30 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
# Taking notes on the high performers for 10, 20, and 30 years
## HIGH PERFORMERS:
## 10 YEARS: OPT 3 19-20, OPT 2 5, OPT 1 19-20
## 20 YEARS: OPT 2 6, OPT 2 19-20
## 30 YEARS: OPT 5, OPT 2 19-20
# Visualizing 10 years 20 cluster solution --------------------------------------------------
seq.10cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 4.366448e-01 8.629729e-01 8.428392e-01 3.901766e-01 3.909038e-01 1.385791e+03
## R2 CHsq R2sq HC
## 6.115862e-01 2.968367e+03 7.713105e-01 6.726921e-02
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(6776, 7088, 7089, 7138, 7143, 7189, 7196) ~ "education",
cluster %in% c(9881, 14946, 14955, 16178) ~ "promotion",
#cluster %in% c() ~ "demotion",
cluster %in% c(14833, 14954, 15114, 15790) ~ "unemployment",
cluster %in% c(10037, 14915, 14957, 16193, 16733) ~ "no change"
),
cluster_label = case_when(
cluster == 6776 ~ "education to unemployment to education",
cluster == 7088 ~ "education to zone 5",
cluster == 7089 ~ "education to zone 4",
cluster == 7138 ~ "education to zone 3",
cluster == 7143 ~ "education to unemployment",
cluster == 7189 ~ "zone 4 to education to zone 4",
cluster == 7196 ~ "education to unemployment to zone 4",
cluster == 9881 ~ "zone 4 to zone 5",
cluster == 10037 ~ "zone 5",
cluster == 14833 ~ "zone 4 to unemployment to zone 4",
cluster == 14915 ~ "zone 4 to zone 3 to zone 4",
cluster == 14946 ~ "zone 2 to zone 4",
cluster == 14954 ~ "zone 4 to unemployment",
cluster == 14955 ~ "zone 3 to zone 4",
cluster == 14957 ~ "zone 4",
cluster == 15114 ~ "zone 2 to unemployment to zone 2",
cluster == 15790 ~ "zone 3 to unemployment to zone 3",
cluster == 16178 ~ "zone 2 to zone 3",
cluster == 16193 ~ "zone 3",
cluster == 16733 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
data_profiling(all_df)
## # A tibble: 34 x 5
## var variable_type num_unique num_missing perc_missing
## <chr> <chr> <int> <int> <dbl>
## 1 id character 16742 0 0
## 2 cluster character 20 0 0
## 3 veteran character 2 0 0
## 4 gender character 3 27946 13.5
## 5 zipcode integer 861 0 0
## 6 irr numeric 377 7 0
## 7 highest_degree factor 7 47212 22.8
## 8 forprofit character 3 22 0.01
## 9 hbcu character 3 22 0.01
## 10 tribal character 3 22 0.01
## # … with 24 more rows
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment architecture 0.760
## 2 education - education to unemployment industry 0.743
## 3 education - education to unemployment economics 0.695
## 4 education - education to unemployment manufacturing 0.646
## 5 education - education to unemployment analysis 0.618
## 6 education - education to unemployment to education science 2.54
## 7 education - education to unemployment to education energy 1.24
## 8 education - education to unemployment to education legal 0.783
## 9 education - education to unemployment to education environment 0.544
## 10 education - education to unemployment to education media 0.544
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment security -0.489
## 2 education - education to unemployment human -0.398
## 3 education - education to unemployment marketing -0.266
## 4 education - education to unemployment information -0.244
## 5 education - education to unemployment business -0.125
## 6 education - education to unemployment to education architecture -0.839
## 7 education - education to unemployment to education supply -0.587
## 8 education - education to unemployment to education maintenance -0.532
## 9 education - education to unemployment to education design -0.464
## 10 education - education to unemployment to education sales -0.364
## # … with 90 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment natural_resource 2.11
## 2 education - education to unemployment library 2.11
## 3 education - education to unemployment culinary_entertainm… 1.80
## 4 education - education to unemployment family_consumer 1.42
## 5 education - education to unemployment education 1.12
## 6 education - education to unemployment to edu… leisure_recreation 3.23
## 7 education - education to unemployment to edu… natural_resource 3.23
## 8 education - education to unemployment to edu… law 3.17
## 9 education - education to unemployment to edu… military_science 2.81
## 10 education - education to unemployment to edu… biology 1.81
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment computer -0.340
## 2 education - education to unemployment communication -0.308
## 3 education - education to unemployment homeland_security -0.253
## 4 education - education to unemployment architecture -0.189
## 5 education - education to unemployment engineering_tech -0.186
## 6 education - education to unemployment to edu… mathematics -0.698
## 7 education - education to unemployment to edu… recreation_kinesiol… -0.634
## 8 education - education to unemployment to edu… education -0.567
## 9 education - education to unemployment to edu… english -0.499
## 10 education - education to unemployment to edu… homeland_security -0.492
## # … with 91 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 0.55367685 0.90600478 0.90594103 0.26188038 0.26406106 385.90288875
## R2 CHsq R2sq HC
## 0.50632390 952.64294089 0.71686249 0.08681695
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(60, 95, 299, 1751, 1840, 2627, 2787, 3242) ~ "education",
cluster %in% c(3346, 5976, 6637, 6664) ~ "promotion",
cluster %in% c(4584, 6386) ~ "demotion",
cluster %in% c(5987, 6718) ~ "unemployment",
cluster %in% c(4630, 6650, 7034, 7168) ~ "no change"
),
cluster_label = case_when(
cluster == 60 ~ "education to zone 5",
cluster == 95 ~ "education to zone 4",
cluster == 299 ~ "education to unemployment to zone 4",
cluster == 1751 ~ "education to unemployment to zone 3",
cluster == 1840 ~ "education to unemployment to zone 2",
cluster == 2627 ~ "education to zone 4 to education to zone 4",
cluster == 2787 ~ "education to unemployment",
cluster == 3242 ~ "education to zone 4",
cluster == 3346 ~ "zone 4 to zone 5",
cluster == 4584 ~ "zone 5 to zone 4",
cluster == 4630 ~ "zone 5",
cluster == 5976 ~ "zone 2 to zone 4",
cluster == 5987 ~ "zone 4 to unemployment to zone 4",
cluster == 6386 ~ "zone 4 to zone 3",
cluster == 6637 ~ "zone 3 to zone 4",
cluster == 6650 ~ "zone 4",
cluster == 6664 ~ "zone 2 to zone 3",
cluster == 6718 ~ "zone 3 to unemployment to zone 3",
cluster == 7034 ~ "zone 3",
cluster == 7168 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups: cluster_long [19]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 engineering 1.09
## 2 demotion - zone 4 to zone 3 media 0.884
## 3 demotion - zone 4 to zone 3 legal 0.496
## 4 demotion - zone 4 to zone 3 maintenance 0.468
## 5 demotion - zone 4 to zone 3 architecture 0.371
## 6 demotion - zone 5 to zone 4 personal 1.62
## 7 demotion - zone 5 to zone 4 analysis 0.883
## 8 demotion - zone 5 to zone 4 design 0.830
## 9 demotion - zone 5 to zone 4 agriculture 0.758
## 10 demotion - zone 5 to zone 4 education 0.400
## # … with 85 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups: cluster_long [19]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 personal -0.590
## 2 demotion - zone 4 to zone 3 health -0.270
## 3 demotion - zone 4 to zone 3 manufacturing -0.267
## 4 demotion - zone 4 to zone 3 education -0.204
## 5 demotion - zone 4 to zone 3 design -0.183
## 6 demotion - zone 5 to zone 4 security -0.656
## 7 demotion - zone 5 to zone 4 environment -0.498
## 8 demotion - zone 5 to zone 4 manufacturing -0.414
## 9 demotion - zone 5 to zone 4 supply -0.275
## 10 demotion - zone 5 to zone 4 business -0.261
## # … with 85 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 97 x 3
## # Groups: cluster_long [19]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 construction 18.2
## 2 demotion - zone 4 to zone 3 cultural_gender 2.03
## 3 demotion - zone 4 to zone 3 english 1.02
## 4 demotion - zone 4 to zone 3 culinary_entertainment 0.918
## 5 demotion - zone 4 to zone 3 architecture 0.668
## 6 demotion - zone 5 to zone 4 transportation_materials 2.79
## 7 demotion - zone 5 to zone 4 education 2.10
## 8 demotion - zone 5 to zone 4 family_consumer 1.53
## 9 demotion - zone 5 to zone 4 mathematics 1.17
## 10 demotion - zone 5 to zone 4 law 1.13
## # … with 87 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 96 x 3
## # Groups: cluster_long [19]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 visual_arts -0.436
## 2 demotion - zone 4 to zone 3 psychology -0.312
## 3 demotion - zone 4 to zone 3 health_professions -0.274
## 4 demotion - zone 4 to zone 3 engineering_tech -0.228
## 5 demotion - zone 4 to zone 3 public_administration -0.195
## 6 demotion - zone 5 to zone 4 homeland_security -0.545
## 7 demotion - zone 5 to zone 4 social_sciences -0.462
## 8 demotion - zone 5 to zone 4 engineering -0.445
## 9 demotion - zone 5 to zone 4 communication -0.393
## 10 demotion - zone 5 to zone 4 recreation_kinesiology -0.344
## # … with 86 more rows
# Visualizing 30 years 10 cluster solution --------------------------------------------------
seq.30cluster_10$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 0.5416033 0.8251470 0.8251211 0.2054577 0.2089100 167.1695255
## R2 CHsq R2sq HC
## 0.4011506 402.1089344 0.6170490 0.1333264
cluster_20 <- seq.30cluster_10$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(380, 429, 506, 880, 1109) ~ "education",
cluster %in% c(2089) ~ "promotion",
#cluster %in% c() ~ "demotion",
#cluster %in% c() ~ "unemployment",
cluster %in% c(584, 2126, 2216, 2255) ~ "no change"
),
cluster_label = case_when(
cluster == 380 ~ "education to zone 4",
cluster == 429 ~ "education to unemployment to zone 3",
cluster == 506 ~ "education to zone 4 to zone 5",
cluster == 584 ~ "zone 5",
cluster == 880 ~ "education to unemployment to zone 4",
cluster == 1109 ~ "education to unemployment",
cluster == 2089 ~ "zone 3 to zone 4",
cluster == 2126 ~ "zone 4",
cluster == 2216 ~ "zone 3",
cluster == 2255 ~ "zone 2",
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.30cluster_10$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment environment 0.462
## 2 education - education to unemployment personal 0.388
## 3 education - education to unemployment customer 0.338
## 4 education - education to unemployment science 0.235
## 5 education - education to unemployment sales 0.127
## 6 education - education to unemployment to zone 3 agriculture 0.717
## 7 education - education to unemployment to zone 3 environment 0.472
## 8 education - education to unemployment to zone 3 media 0.367
## 9 education - education to unemployment to zone 3 design 0.341
## 10 education - education to unemployment to zone 3 supply 0.201
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment energy -0.423
## 2 education - education to unemployment industry -0.302
## 3 education - education to unemployment architecture -0.295
## 4 education - education to unemployment design -0.273
## 5 education - education to unemployment legal -0.252
## 6 education - education to unemployment to zone 3 industry -0.356
## 7 education - education to unemployment to zone 3 science -0.240
## 8 education - education to unemployment to zone 3 legal -0.203
## 9 education - education to unemployment to zone 3 economics -0.165
## 10 education - education to unemployment to zone 3 business -0.160
## # … with 40 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment transportation_materi… 1.73
## 2 education - education to unemployment architecture 1.14
## 3 education - education to unemployment leisure_recreation 0.821
## 4 education - education to unemployment military_science 0.639
## 5 education - education to unemployment family_consumer 0.518
## 6 education - education to unemployment to z… library 4.41
## 7 education - education to unemployment to z… military_science 2.25
## 8 education - education to unemployment to z… philosophy_religion 1.16
## 9 education - education to unemployment to z… natural_resource 0.803
## 10 education - education to unemployment to z… english 0.708
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - education to unemployment culinary_entertainm… -0.590
## 2 education - education to unemployment agriculture_veterin… -0.415
## 3 education - education to unemployment cultural_gender -0.353
## 4 education - education to unemployment visual_arts -0.277
## 5 education - education to unemployment communication -0.246
## 6 education - education to unemployment to zon… education -0.631
## 7 education - education to unemployment to zon… mathematics -0.614
## 8 education - education to unemployment to zon… family_consumer -0.399
## 9 education - education to unemployment to zon… humanities -0.340
## 10 education - education to unemployment to zon… architecture -0.294
## # … with 40 more rows
# 15 YEARS EXPERIENCE ------------------------------------------------------------------------
# Creating sequence objects fo 10, 20, and 30 years
seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE, 15)
seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE, 15)
seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE, 15)
# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)
# Visualizing 10 years 20 cluster solution --------------------------------------------------
seq.10cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 4.652542e-01 8.711621e-01 8.533859e-01 4.448714e-01 4.464166e-01 7.324497e+02
## R2 CHsq R2sq HC
## 6.343133e-01 1.364724e+03 7.637011e-01 4.976412e-02
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(483, 1470, 2107, 2136, 2154) ~ "education",
cluster %in% c(4183, 7140, 7149, 7746) ~ "promotion",
cluster %in% c(4172, 7133) ~ "demotion",
cluster %in% c(4166, 6940, 7114, 7725) ~ "unemployment",
cluster %in% c(4139, 7116, 7151, 7761, 8042) ~ "no change"
),
cluster_label = case_when(
cluster == 483 ~ "education to zone 5",
cluster == 1470 ~ "zone 4 to education to zone 4",
cluster == 2107 ~ "zone 3 to education to zone 3",
cluster == 2136 ~ "zone 4 to education",
cluster == 2154 ~ "zone 4 to education to zone 4",
cluster == 4139 ~ "zone 4 to zone 5 to zone 4",
cluster == 4166 ~ "zone 5 to unemployment",
cluster == 4172 ~ "zone 5 to zone 4",
cluster == 4183 ~ "zone 4 to zone 5",
cluster == 6940 ~ "zone 4 to unemployment to zone 4",
cluster == 7114 ~ "zone 4 to unemployment",
cluster == 7116 ~ "zone 4 to zone 3 to zone 4",
cluster == 7133 ~ "zone 4 to zone 3",
cluster == 7140 ~ "zone 2 to zone 4",
cluster == 7149 ~ "zone 3 to zone 4",
cluster == 7151 ~ "zone 4",
cluster == 7725 ~ "zone 3 to unemployment",
cluster == 7746 ~ "zone 2 to zone 3",
cluster == 7761 ~ "zone 3",
cluster == 8042 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups: cluster_long [19]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 analysis 0.945
## 2 demotion - zone 4 to zone 3 design 0.815
## 3 demotion - zone 4 to zone 3 engineering 0.760
## 4 demotion - zone 4 to zone 3 maintenance 0.535
## 5 demotion - zone 4 to zone 3 customer 0.496
## 6 demotion - zone 5 to zone 4 economics 2.53
## 7 demotion - zone 5 to zone 4 engineering 1.12
## 8 demotion - zone 5 to zone 4 marketing 0.476
## 9 demotion - zone 5 to zone 4 architecture 0.396
## 10 demotion - zone 5 to zone 4 business 0.305
## # … with 85 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups: cluster_long [19]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 marketing -0.700
## 2 demotion - zone 4 to zone 3 business -0.435
## 3 demotion - zone 4 to zone 3 manufacturing -0.428
## 4 demotion - zone 4 to zone 3 media -0.381
## 5 demotion - zone 4 to zone 3 health -0.231
## 6 demotion - zone 5 to zone 4 maintenance -0.580
## 7 demotion - zone 5 to zone 4 environment -0.468
## 8 demotion - zone 5 to zone 4 manufacturing -0.437
## 9 demotion - zone 5 to zone 4 finance -0.282
## 10 demotion - zone 5 to zone 4 customer -0.264
## # … with 85 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 96 x 3
## # Groups: cluster_long [19]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 culinary_entertainment 7.32
## 2 demotion - zone 4 to zone 3 philosophy_religion 1.45
## 3 demotion - zone 4 to zone 3 interdisciplinary 1.31
## 4 demotion - zone 4 to zone 3 visual_arts 1.31
## 5 demotion - zone 4 to zone 3 law 1.31
## 6 demotion - zone 5 to zone 4 architecture 3.83
## 7 demotion - zone 5 to zone 4 agriculture_veterinary 1.61
## 8 demotion - zone 5 to zone 4 interdisciplinary 1.32
## 9 demotion - zone 5 to zone 4 cultural_gender 1.32
## 10 demotion - zone 5 to zone 4 law 1.32
## # … with 86 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 95 x 3
## # Groups: cluster_long [19]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 communication -0.685
## 2 demotion - zone 4 to zone 3 engineering -0.593
## 3 demotion - zone 4 to zone 3 psychology -0.524
## 4 demotion - zone 4 to zone 3 public_administration -0.492
## 5 demotion - zone 4 to zone 3 language_linguistics -0.184
## 6 demotion - zone 5 to zone 4 physics -0.782
## 7 demotion - zone 5 to zone 4 health_professions -0.598
## 8 demotion - zone 5 to zone 4 humanities -0.346
## 9 demotion - zone 5 to zone 4 business -0.303
## 10 demotion - zone 5 to zone 4 computer -0.289
## # … with 85 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 0.59830170 0.90939171 0.90930998 0.28257954 0.28884498 135.12292411
## R2 CHsq R2sq HC
## 0.50445398 316.19414619 0.70432684 0.06991502
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(123, 303, 530, 639) ~ "education",
cluster %in% c(1004, 1196, 1598, 2244, 2381) ~ "promotion",
cluster %in% c(965, 1541, 2172, 2487, 2542) ~ "demotion",
cluster %in% c(958, 1822, 2310) ~ "unemployment",
cluster %in% c(1452, 1461, 2371) ~ "no change"
),
cluster_label = case_when(
cluster == 123 ~ "education to unemployment",
cluster == 303 ~ "zone 4 to education to zone 4",
cluster == 530 ~ "education to zone 5",
cluster == 639 ~ "education to zone 4",
cluster == 958 ~ "zone 5 to unemployment to zone 5",
cluster == 965 ~ "zone 5 to zone 4",
cluster == 1004 ~ "zone 4 to zone 5 (Late career)",
cluster == 1196 ~ "zone 4 to zone 5 (Early career)",
cluster == 1452 ~ "zone 4 to zone 5 to zone 4",
cluster == 1461 ~ "zone 5",
cluster == 1541 ~ "zone 4 to zone 3",
cluster == 1598 ~ "zone 3 to zone 4",
cluster == 1822 ~ "zone 4 to unemployment",
cluster == 2172 ~ "zone 4 to zone 2",
cluster == 2244 ~ "zone 2 to zone 4",
cluster == 2310 ~ "zone 4 to unemployment to zone 4",
cluster == 2371 ~ "zone 4",
cluster == 2381 ~ "zone 2 to zone 3",
cluster == 2487 ~ "zone 4 to zone 3",
cluster == 2542 ~ "zone 4 to zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 90 x 3
## # Groups: cluster_long [18]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 analysis 4.36
## 2 demotion - zone 4 to zone 2 agriculture 1.84
## 3 demotion - zone 4 to zone 2 economics 1.47
## 4 demotion - zone 4 to zone 2 environment 1.23
## 5 demotion - zone 4 to zone 2 manufacturing 1.00
## 6 demotion - zone 4 to zone 3 science 1.35
## 7 demotion - zone 4 to zone 3 environment 0.946
## 8 demotion - zone 4 to zone 3 personal 0.747
## 9 demotion - zone 4 to zone 3 industry 0.733
## 10 demotion - zone 4 to zone 3 manufacturing 0.488
## # … with 80 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 91 x 3
## # Groups: cluster_long [18]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 human -0.797
## 2 demotion - zone 4 to zone 2 engineering -0.596
## 3 demotion - zone 4 to zone 2 education -0.526
## 4 demotion - zone 4 to zone 2 personal -0.491
## 5 demotion - zone 4 to zone 2 analysis -0.433
## 6 demotion - zone 4 to zone 3 maintenance -0.779
## 7 demotion - zone 4 to zone 3 analysis -0.611
## 8 demotion - zone 4 to zone 3 manufacturing -0.505
## 9 demotion - zone 4 to zone 3 media -0.464
## 10 demotion - zone 4 to zone 3 security -0.425
## # … with 81 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 93 x 3
## # Groups: cluster_long [18]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 recreation_kinesiology 4.35
## 2 demotion - zone 4 to zone 2 culinary_entertainment 3.64
## 3 demotion - zone 4 to zone 2 cultural_gender 1.58
## 4 demotion - zone 4 to zone 2 engineering_tech 1.56
## 5 demotion - zone 4 to zone 2 language_linguistics 1.53
## 6 demotion - zone 4 to zone 3 library 13.1
## 7 demotion - zone 4 to zone 3 leisure_recreation 8.51
## 8 demotion - zone 4 to zone 3 culinary_entertainment 4.65
## 9 demotion - zone 4 to zone 3 military_science 2.57
## 10 demotion - zone 4 to zone 3 architecture 2.29
## # … with 83 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 90 x 3
## # Groups: cluster_long [18]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 physics -0.758
## 2 demotion - zone 4 to zone 2 public_administration -0.717
## 3 demotion - zone 4 to zone 2 engineering_tech -0.448
## 4 demotion - zone 4 to zone 2 homeland_security -0.434
## 5 demotion - zone 4 to zone 2 computer -0.369
## 6 demotion - zone 4 to zone 3 psychology -0.677
## 7 demotion - zone 4 to zone 3 biology -0.660
## 8 demotion - zone 4 to zone 3 health_professions -0.457
## 9 demotion - zone 4 to zone 3 theology_religion -0.380
## 10 demotion - zone 4 to zone 3 public_administration -0.311
## # … with 80 more rows
# APPROXIMATE AGE ---------------------------------------------------------------
# Cohort clustering whole career ------------------------------------------------
seq.10 <- create_seq_all_cohort("seq.10", 2000, 2005, after_military = FALSE, left_unemp = FALSE)
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(2791, 6868, 6884, 6885, 6888) ~ "education",
cluster %in% c(8748, 14400, 14430, 16736) ~ "promotion",
cluster %in% c(8741, 14409, 14414) ~ "demotion",
cluster %in% c(14053) ~ "unemployment",
cluster %in% c(2830, 2831, 6883, 8743, 14435, 16744, 18571) ~ "no change"
),
cluster_label = case_when(
cluster == 2791 ~ "education to military",
cluster == 2809 ~ "military",
cluster == 2830 ~ "military to zone 4",
cluster == 2831 ~ "education to zone 4",
cluster == 6862 ~ "zone 4 to education to military/zone 2",
cluster == 6885 ~ "education to zone 3",
cluster == 6887 ~ "education to zone 2",
cluster == 6895 ~ "education to zone 5",
cluster == 6897 ~ "zone 5 to zone 4",
cluster == 8763 ~ "zone 5",
cluster == 8764 ~ "zone 4 to zone 5",
cluster == 8765 ~ "unemployment to zone 4",
cluster == 14495 ~ "zone 2 to zone 4",
cluster == 14510 ~ "zone 4 to zone 3",
cluster == 14513 ~ "zone 4 to zone 2",
cluster == 14519 ~ "zone 3 to zone 4",
cluster == 14520 ~ "zone 4",
cluster == 16825 ~ "zone 2 to zone 3",
cluster == 16834 ~ "zone 3",
cluster == 18637 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seq.10 <- seq.10[,1:20]
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", axes = c(1,20),
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "red", "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "red", "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
# Filtering for five years of experience and careers of 10 years ----------------------------------
seq.10 <- create_seq_all_fixed_experience("seq.10", 10, left_unemp = FALSE, 5)
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(3140, 3148, 3191, 3212) ~ "education",
cluster %in% c(5016, 8681, 8684, 9682) ~ "promotion",
cluster %in% c(5053, 8641) ~ "demotion",
cluster %in% c(8523, 8670, 9664, 10161) ~ "unemployment",
cluster %in% c(5074, 5078, 8310, 8675, 9682, 10156) ~ "no change"
),
cluster_label = case_when(
cluster == 3140 ~ "education to zone 4",
cluster == 3148 ~ "education to unemployment",
cluster == 3191 ~ "zone 4 to education to zone 4",
cluster == 3212 ~ "education to zone 5",
cluster == 5016 ~ "zone 4 to zone 5",
cluster == 5053 ~ "zone 5 to zone 4",
cluster == 5074 ~ "zone 5",
cluster == 5078 ~ "zone 4 to zone 5 to zone 4",
cluster == 8310 ~ "zone 4",
cluster == 8523 ~ "zone 4 to unemployment to zone 4",
cluster == 8641 ~ "zone 4 to zone 3",
cluster == 8670 ~ "zone 4 to unemployment",
cluster == 8675 ~ "zone 4 to zone 3 to zone 4",
cluster == 8681 ~ "zone 2 to zone 4",
cluster == 8684 ~ "zone 3 to zone 4",
cluster == 9664 ~ "zone 3 to unemployment to zone 3",
cluster == 9682 ~ "zone 2 to zone 3",
cluster == 9698 ~ "zone 3",
cluster == 10156 ~ "zone 2",
cluster == 10161 ~ "zone 2 to unemployment to zone 2",
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
all_df <- all_df %>% mutate(
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", axes = c(1,20),
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
# VETERANS -----------------------------------------------------------------------------
# Creating sequence objects for 10, 20, and 30 years after exiting the military for veterans
seq.10 <- create_seq_vet_fixed("seq.10", 10, left_unemp = FALSE)
seq.20 <- create_seq_vet_fixed("seq.20", 20, left_unemp = FALSE)
seq.30 <- create_seq_vet_fixed("seq.30", 30, left_unemp = FALSE)
# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)
# Visualizing 10 years 20 cluster solution ----------------------
seq.10cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 0.42080980 0.82953931 0.80984445 0.37621229 0.37882230 389.84777555
## R2 CHsq R2sq HC
## 0.60628979 727.08985966 0.74174091 0.06945614
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(781, 1360, 1435, 1511) ~ "education",
cluster %in% c(2607, 4345, 4416, 4690) ~ "promotion",
cluster %in% c(2769, 4409) ~ "demotion",
cluster %in% c(2727, 4303, 4390, 4684) ~ "unemployment",
cluster %in% c(2763, 2772, 4169, 4427, 4705, 4830) ~ "no change"
),
cluster_label = case_when(
cluster == 781 ~ "zone 5 to education to zone 5",
cluster == 1360 ~ "zone 4 to education to zone 4",
cluster == 1435 ~ "education to zone 4",
cluster == 1511 ~ "zone 4 to education",
cluster == 2607 ~ "zone 4 to zone 5",
cluster == 2727 ~ "zone 4 to unemployment to zone 5",
cluster == 2763 ~ "zone 5",
cluster == 2769 ~ "zone 5 to zone 4",
cluster == 2772 ~ "zone 4 to zone 5 zone 4",
cluster == 4169 ~ "zone 4 to zone 3 to zone 4",
cluster == 4303 ~ "zone 4 to unemployment to zone 4",
cluster == 4345 ~ "zone 3 to zone 4",
cluster == 4390 ~ "zone 4 to unemployment",
cluster == 4409 ~ "zone 4 to zone 3",
cluster == 4416 ~ "zone 2 to zone 4",
cluster == 4427 ~ "zone 4",
cluster == 4684 ~ "zone 3 to unemployment",
cluster == 4690 ~ "zone 2 to zone 3",
cluster == 4705 ~ "zone 3",
cluster == 4830 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
# Plotting
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
all_df <- all_df %>% mutate(end_onet55_label = case_when(
end_onet55 == "55-3011.00" ~ "Air Crew Members",
end_onet55 == "55-1011.00" ~ "Air Crew Officers",
end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
end_onet55 == "55-3016.00" ~ "Infantry",
end_onet55 == "55-1016.00" ~ "Infantry Officers",
end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
end_onet55 == "55-3018.00" ~ "Special Forces",
end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization(officer)
cluster_membership_visualization(end_onet55_label)
cluster_membership_visualization(years_in_military_bin)
cluster_membership_visualization(year_military_enlist_bin)
cluster_membership_visualization(career_before_military_bin)
cluster_membership_visualization_skillmajor(end_onet55_label)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 design 2.22
## 2 demotion - zone 4 to zone 3 analysis 1.49
## 3 demotion - zone 4 to zone 3 environment 1.19
## 4 demotion - zone 4 to zone 3 legal 1.03
## 5 demotion - zone 4 to zone 3 sales 0.727
## 6 demotion - zone 5 to zone 4 economics 2.97
## 7 demotion - zone 5 to zone 4 media 1.92
## 8 demotion - zone 5 to zone 4 engineering 1.48
## 9 demotion - zone 5 to zone 4 analysis 0.806
## 10 demotion - zone 5 to zone 4 business 0.359
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 health -0.569
## 2 demotion - zone 4 to zone 3 security -0.424
## 3 demotion - zone 4 to zone 3 business -0.368
## 4 demotion - zone 4 to zone 3 education -0.353
## 5 demotion - zone 4 to zone 3 engineering -0.240
## 6 demotion - zone 5 to zone 4 maintenance -0.884
## 7 demotion - zone 5 to zone 4 manufacturing -0.712
## 8 demotion - zone 5 to zone 4 architecture -0.577
## 9 demotion - zone 5 to zone 4 human -0.538
## 10 demotion - zone 5 to zone 4 finance -0.304
## # … with 90 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 law 6.50
## 2 demotion - zone 4 to zone 3 engineering_tech 2.05
## 3 demotion - zone 4 to zone 3 homeland_security 1.36
## 4 demotion - zone 4 to zone 3 computer 1.05
## 5 demotion - zone 4 to zone 3 humanities 0.963
## 6 demotion - zone 5 to zone 4 agriculture_veterinary 2.61
## 7 demotion - zone 5 to zone 4 education 2.61
## 8 demotion - zone 5 to zone 4 law 2.28
## 9 demotion - zone 5 to zone 4 architecture 1.26
## 10 demotion - zone 5 to zone 4 public_administration 0.935
## # … with 91 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 3 engineering -0.518
## 2 demotion - zone 4 to zone 3 business -0.265
## 3 demotion - zone 4 to zone 3 social_sciences 0.461
## 4 demotion - zone 4 to zone 3 psychology 0.666
## 5 demotion - zone 4 to zone 3 humanities 0.963
## 6 demotion - zone 5 to zone 4 humanities -0.570
## 7 demotion - zone 5 to zone 4 language_linguistics -0.484
## 8 demotion - zone 5 to zone 4 physics -0.366
## 9 demotion - zone 5 to zone 4 engineering_tech -0.331
## 10 demotion - zone 5 to zone 4 business -0.287
## # … with 90 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
## PBC HG HGSD ASW ASWw CH
## 0.53963236 0.87397820 0.87388810 0.21171959 0.22112803 84.55773514
## R2 CHsq R2sq HC
## 0.48587626 192.33936296 0.68250695 0.09853612
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(91, 163, 343, 452) ~ "education",
cluster %in% c(628, 1020, 1071, 1348, 1643) ~ "promotion",
cluster %in% c(576, 1363, 1428, 1690, 1720) ~ "demotion",
cluster %in% c(668, 1078, 1566) ~ "unemployment",
cluster %in% c(1018, 1027, 1627) ~ "no change"
),
cluster_label = case_when(
cluster == 91 ~ "education to unemployment",
cluster == 163 ~ "zone 4 to education to zone 4",
cluster == 343 ~ "education to zone 5",
cluster == 452 ~ "education to zone 4",
cluster == 576 ~ "zone 5 to zone 4",
cluster == 628 ~ "zone 4 to zone 5 (early career)",
cluster == 668 ~ "zone 4/5 to unemployment to zone 5",
cluster == 1018 ~ "zone 4 to zone 5 to zone 4",
cluster == 1020 ~ "zone 4 to zone 5 (late career)",
cluster == 1027 ~ "zone 5",
cluster == 1071 ~ "zone 3 to zone 4 (early career)",
cluster == 1078 ~ "zone 4 to unemployment",
cluster == 1348 ~ "zone 3 to zone 4 (mid career)",
cluster == 1363 ~ "zone 4 to zone 3 (mid career)",
cluster == 1428 ~ "zone 4 to zone 2 (mid career)",
cluster == 1566 ~ "zone 4 to unemployment to zone 4",
cluster == 1627 ~ "zone 4",
cluster == 1643 ~ "zone 2 to zone 3",
cluster == 1690 ~ "zone 4 to zone 3 (early career)",
cluster == 1720 ~ "zone 4 to zone 2 (early career)"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
# Plotting
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
all_df <- all_df %>% mutate(end_onet55_label = case_when(
end_onet55 == "55-3011.00" ~ "Air Crew Members",
end_onet55 == "55-1011.00" ~ "Air Crew Officers",
end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
end_onet55 == "55-3016.00" ~ "Infantry",
end_onet55 == "55-1016.00" ~ "Infantry Officers",
end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
end_onet55 == "55-3018.00" ~ "Special Forces",
end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization(officer)
cluster_membership_visualization(end_onet55_label)
cluster_membership_visualization(years_in_military_bin)
cluster_membership_visualization(year_military_enlist_bin)
cluster_membership_visualization(career_before_military_bin)
cluster_membership_visualization_skillmajor(end_onet55_label)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 (early career) personal 4.43
## 2 demotion - zone 4 to zone 2 (early career) science 0.810
## 3 demotion - zone 4 to zone 2 (early career) legal 0.584
## 4 demotion - zone 4 to zone 2 (early career) security 0.286
## 5 demotion - zone 4 to zone 2 (early career) finance 0.167
## 6 demotion - zone 4 to zone 2 (mid career) analysis 3.08
## 7 demotion - zone 4 to zone 2 (mid career) energy 1.80
## 8 demotion - zone 4 to zone 2 (mid career) customer 1.09
## 9 demotion - zone 4 to zone 2 (mid career) engineering 0.869
## 10 demotion - zone 4 to zone 2 (mid career) environment 0.794
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups: cluster_long [20]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 (early career) manufacturing -0.646
## 2 demotion - zone 4 to zone 2 (early career) architecture -0.480
## 3 demotion - zone 4 to zone 2 (early career) customer -0.432
## 4 demotion - zone 4 to zone 2 (early career) supply -0.391
## 5 demotion - zone 4 to zone 2 (early career) maintenance -0.286
## 6 demotion - zone 4 to zone 2 (mid career) human -0.652
## 7 demotion - zone 4 to zone 2 (mid career) marketing -0.596
## 8 demotion - zone 4 to zone 2 (mid career) health -0.294
## 9 demotion - zone 4 to zone 2 (mid career) sales -0.292
## 10 demotion - zone 4 to zone 2 (mid career) legal -0.169
## # … with 90 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 (early career) culinary_entertainment 4.73
## 2 demotion - zone 4 to zone 2 (early career) architecture 4.73
## 3 demotion - zone 4 to zone 2 (early career) theology_religion 4.29
## 4 demotion - zone 4 to zone 2 (early career) education 3.58
## 5 demotion - zone 4 to zone 2 (early career) family_consumer 3.58
## 6 demotion - zone 4 to zone 2 (mid career) recreation_kinesiology 2.99
## 7 demotion - zone 4 to zone 2 (mid career) law 2.63
## 8 demotion - zone 4 to zone 2 (mid career) homeland_security 2.42
## 9 demotion - zone 4 to zone 2 (mid career) language_linguistics 2.42
## 10 demotion - zone 4 to zone 2 (mid career) public_administration 1.85
## # … with 91 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups: cluster_long [20]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 demotion - zone 4 to zone 2 (early career) public_administration -0.591
## 2 demotion - zone 4 to zone 2 (early career) communication -0.316
## 3 demotion - zone 4 to zone 2 (early career) business -0.241
## 4 demotion - zone 4 to zone 2 (early career) engineering_tech -0.152
## 5 demotion - zone 4 to zone 2 (early career) computer -0.103
## 6 demotion - zone 4 to zone 2 (mid career) engineering -0.533
## 7 demotion - zone 4 to zone 2 (mid career) computer -0.432
## 8 demotion - zone 4 to zone 2 (mid career) communication -0.404
## 9 demotion - zone 4 to zone 2 (mid career) physics -0.300
## 10 demotion - zone 4 to zone 2 (mid career) engineering_tech -0.261
## # … with 91 more rows
# Visualizing 30 years 10 cluster solution -------------------------------------------------
seq.30cluster_10$cluster$stats
## PBC HG HGSD ASW ASWw CH R2
## 0.5528748 0.8025530 0.8025144 0.2035540 0.2280479 21.4122729 0.3788215
## CHsq R2sq HC
## 47.5151041 0.5750609 0.1226446
cluster_20 <- seq.30cluster_10$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.30)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
cluster %in% c(3, 40, 122) ~ "education",
cluster %in% c(135, 325) ~ "promotion",
cluster %in% c() ~ "demotion",
cluster %in% c(302, 303) ~ "unemployment",
cluster %in% c(186, 238, 314) ~ "no change",
),
cluster_label = case_when(
cluster == 3 ~ "zone 4 to education to zone 4 and unemmployment",
cluster == 40 ~ "zone 4 to education to zone 4",
cluster == 122 ~ "zone 4 to long unemployment to education to zone 5",
cluster == 135 ~ "zone 4 to zone 5 to zone 4",
cluster == 186 ~ "zone 5",
cluster == 238 ~ "zone 3",
cluster == 302 ~ "zone 4 to zone 3 to unemployment",
cluster == 303 ~ "zone 4 to unemployment to zone 4",
cluster == 314 ~ "zone 4",
cluster == 325 ~ "zone 2 to zone 5 to zone 4"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")
# Plotting
seqdplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
seqrplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.30cluster_10$diss,
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5"))
all_df <- all_df %>% mutate(end_onet55_label = case_when(
end_onet55 == "55-3011.00" ~ "Air Crew Members",
end_onet55 == "55-1011.00" ~ "Air Crew Officers",
end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
end_onet55 == "55-3016.00" ~ "Infantry",
end_onet55 == "55-1016.00" ~ "Infantry Officers",
end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
end_onet55 == "55-3018.00" ~ "Special Forces",
end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization(officer)
cluster_membership_visualization(end_onet55_label)
cluster_membership_visualization(years_in_military_bin)
cluster_membership_visualization(year_military_enlist_bin)
cluster_membership_visualization(career_before_military_bin)
cluster_membership_visualization_skillmajor(end_onet55_label)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)
df <- all_df %>%
filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - zone 4 to education to zone 4 design 9.03
## 2 education - zone 4 to education to zone 4 human 1.51
## 3 education - zone 4 to education to zone 4 informati… 0.527
## 4 education - zone 4 to education to zone 4 engineeri… 0.114
## 5 education - zone 4 to education to zone 4 education 0.00263
## 6 education - zone 4 to education to zone 4 and unemmplo… human 3.88
## 7 education - zone 4 to education to zone 4 and unemmplo… health 1.44
## 8 education - zone 4 to education to zone 4 and unemmplo… finance 0.628
## 9 education - zone 4 to education to zone 4 and unemmplo… informati… 0.553
## 10 education - zone 4 to education to zone 4 and unemmplo… business -0.542
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups: cluster_long [10]
## cluster_long top_skill difference
## <chr> <chr> <dbl>
## 1 education - zone 4 to education to zone 4 finance -0.443
## 2 education - zone 4 to education to zone 4 sales -0.410
## 3 education - zone 4 to education to zone 4 business -0.373
## 4 education - zone 4 to education to zone 4 security -0.0885
## 5 education - zone 4 to education to zone 4 marketing -0.0885
## 6 education - zone 4 to education to zone 4 and unemmplo… business -0.542
## 7 education - zone 4 to education to zone 4 and unemmplo… informati… 0.553
## 8 education - zone 4 to education to zone 4 and unemmplo… finance 0.628
## 9 education - zone 4 to education to zone 4 and unemmplo… health 1.44
## 10 education - zone 4 to education to zone 4 and unemmplo… human 3.88
## # … with 40 more rows
df <- all_df %>%
filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
group_by(cluster) %>%
mutate(cluster_total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)
df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 44 x 3
## # Groups: cluster_long [10]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - zone 4 to education to zone 4 physics 4.69
## 2 education - zone 4 to education to zone 4 philosophy_religi… 4.69
## 3 education - zone 4 to education to zone 4 culinary_entertai… 4.69
## 4 education - zone 4 to education to zone 4 health_professions 4.69
## 5 education - zone 4 to education to zone 4 engineering_tech 1.84
## 6 education - zone 4 to education to zone 4 and … communication 3.33
## 7 education - zone 4 to education to zone 4 and … engineering 0.784
## 8 education - zone 4 to education to zone 4 and … social_sciences 0.569
## 9 education - zone 4 to education to zone 4 and … public_administra… 0.517
## 10 education - zone 4 to education to zone 4 and … computer -0.278
## # … with 34 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 44 x 3
## # Groups: cluster_long [10]
## cluster_long top_major difference
## <chr> <chr> <dbl>
## 1 education - zone 4 to education to zone 4 social_sciences -0.608
## 2 education - zone 4 to education to zone 4 engineering -0.498
## 3 education - zone 4 to education to zone 4 psychology -0.368
## 4 education - zone 4 to education to zone 4 language_linguis… -0.0521
## 5 education - zone 4 to education to zone 4 business 0.185
## 6 education - zone 4 to education to zone 4 and u… business -0.684
## 7 education - zone 4 to education to zone 4 and u… computer -0.278
## 8 education - zone 4 to education to zone 4 and u… public_administr… 0.517
## 9 education - zone 4 to education to zone 4 and u… social_sciences 0.569
## 10 education - zone 4 to education to zone 4 and u… engineering 0.784
## # … with 34 more rows
library(RColorBrewer)
overall <- all_df %>% filter(!is.na(top_skill)) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n()) %>% group_by(top_skill) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% distinct(top_skill, overall_covariate_percent) %>% mutate(cluster_label = "overall", cluster_group = "overall", overall_cluster_percent = .1) %>% rename(cluster_covariate_percent = overall_covariate_percent)
nb.cols <- length(unique(all_df$top_skill))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
all_df %>% filter(!is.na(top_skill)) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n(), overall_cluster_percent = n()/total) %>% group_by(top_skill) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% group_by(cluster, top_skill) %>% mutate(cluster_covariate_percent = round((n()/overall_cluster_count * 100), 2)) %>% distinct(cluster, cluster_label, cluster_group, top_skill, cluster_covariate_percent, overall_cluster_percent) %>% group_by(top_skill) %>% left_join(select(overall, overall = cluster_covariate_percent), by = "top_skill") %>% rbind(overall) %>% mutate(alpha = ifelse(!is.na(overall), abs((overall - cluster_covariate_percent)/overall), max(abs((overall - cluster_covariate_percent)/overall)))) %>%
ggplot(aes(cluster_covariate_percent, cluster_label, fill = factor(top_skill), alpha = factor(alpha))) +
geom_col(aes(width = overall_cluster_percent*10)) +
facet_wrap(~ factor(cluster_group, levels = c("overall", "education", "promotion", "no change", "demotion", "unemployment")), ncol = 1, scales="free") + scale_fill_manual(values = mycolors) +
scale_alpha_discrete(guide = "none")
#cluster_membership_visualization(skill)
#cluster_membership_visualization(major)
top_skill <- all_df %>%
filter(!is.na(top_skill)) %>%
group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(top_skill) %>%
mutate(total = n()) %>%
group_by(cluster, top_skill) %>%
mutate(percent = n()/total * 100, difference = (percent - cluster_percent)/percent) %>%
distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, cluster_percent, difference) %>%
group_by(top_skill) %>% mutate(sum = sum(percent))
top_skill$cluster <- as.factor(top_skill$cluster)
top_skill <- top_skill %>% filter(!is.na(top_skill))
top_skill <- as.data.frame(top_skill)
ggplot(top_skill, aes(cluster_label, top_skill, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-10, 10))+
theme_classic()+
labs(title = "Cluster membership by top_skill") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")
top_major <- all_df %>%
filter(!is.na(top_major)) %>%
group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(top_major) %>%
mutate(total = n()) %>%
group_by(cluster, top_major) %>%
mutate(percent = n()/total * 100, difference = (percent - cluster_percent)/percent) %>%
distinct(top_major, cluster_long, cluster_group, cluster_label, percent, cluster_percent, difference) %>%
group_by(top_major) %>% mutate(sum = sum(percent))
top_major$cluster <- as.factor(top_major$cluster)
top_major <- top_major %>% filter(!is.na(top_major))
top_major <- as.data.frame(top_major)
ggplot(top_major, aes(cluster_label, top_major, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-10, 10))+
theme_classic()+
labs(title = "Cluster membership by top_major") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")
#install.packages("ggpubr", dependencies = TRUE)
#ggballoonplot(aes(x = all_df$cluster, y = all_df%top_skill))
#install.packages("FactoMineR", dependencies = TRUE)
#install.packages("factoextra", dependencies = TRUE)
#library(FactoMineR)
library(ca)
test <- all_df %>% select(cluster_label, cluster_group, top_skill) %>% mutate(value = 1)
table <- dcast(test, cluster_label ~ top_skill, fun.aggregate = sum)
table <- table %>% remove_rownames %>% column_to_rownames(var = "cluster_label")
#as.table(table)
plot(ca(table))
test <- all_df %>% select(cluster_label, cluster_group, top_major) %>% mutate(value = 1)
table <- dcast(test, cluster_label ~ top_major, fun.aggregate = sum)
table <- table %>% remove_rownames %>% column_to_rownames(var = "cluster_label")
#as.table(table)
plot(ca(table))
all_df %>% select(cluster_label, cluster_group, top_skill) %>% mutate(count = 1, total = n()) %>% group_by(top_skill) %>% mutate(skill_total = sum(count)) %>% ungroup() %>% group_by(cluster_label) %>% mutate(cluster_total = sum(count)) %>% group_by(cluster_label, top_skill) %>% mutate(value = sum(count)/cluster_total) %>%
ggplot(aes(x = cluster_label, y = top_skill)) +
geom_point(aes(size=value),shape=21, colour="black", fill="skyblue")+
theme(panel.background=element_blank(), panel.border = element_rect(colour = "blue", fill=NA, size=1))+
scale_size_area(max_size=20)+
facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
cluster_membership_visualization_skillmajor(skill)
all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(gender) %>%
mutate(total = n()) %>%
group_by(cluster, gender) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(gender, cluster, percent, cluster_percent, difference) %>%
group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_forprofit) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_forprofit) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_hbcu) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_hbcu) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(irr_bin) %>%
mutate(total = n()) %>%
group_by(cluster, irr_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(officer) %>%
mutate(total = n()) %>%
group_by(cluster, officer) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(officer, cluster, percent, cluster_percent, difference) %>%
group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(end_onet55) %>%
mutate(total = n()) %>% filter(total > 10) %>%
group_by(cluster, end_onet55) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_military_enlist_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_military_enlist_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_entered_workforce_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_entered_workforce_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year entered the workforce")
# Visualizing 10 years 20 cluster solution
seq.20cluster_20$stats
cluster_20 <- seq.20cluster_20$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id")
veteran <- all_df %>%
filter(!is.na(veteran)) %>%
group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(veteran) %>%
mutate(total = n()) %>%
group_by(cluster, veteran) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(veteran, cluster, percent, cluster_percent, difference) %>%
group_by(veteran) %>% mutate(sum = sum(percent))
veteran$cluster <- as.factor(veteran$cluster)
veteran <- veteran %>% filter(!is.na(veteran))
veteran <- as.data.frame(veteran)
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Clustering membership by veteran")
# Visualizing 30 years 20 cluster solution
seq.30cluster_5$stats
cluster_5 <- seq.30cluster_5$clustering
cluster_5_fac <- factor(cluster_5)
all <- rownames(seq.30)
all_df <- as.data.frame(cbind(all, cluster_5))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id")
veteran <- all_df %>%
filter(!is.na(veteran)) %>%
group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(veteran) %>%
mutate(total = n()) %>%
group_by(cluster, veteran) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(veteran, cluster, percent, cluster_percent, difference) %>%
group_by(veteran) %>% mutate(sum = sum(percent))
veteran$cluster <- as.factor(veteran$cluster)
veteran <- veteran %>% filter(!is.na(veteran))
veteran <- as.data.frame(veteran)
seqdplot(seq.30, group = cluster_5_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Clustering membership by veteran")
all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(gender) %>%
mutate(total = n()) %>%
group_by(cluster, gender) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(gender, cluster, percent, cluster_percent, difference) %>%
group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_forprofit) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_forprofit) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_hbcu) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_hbcu) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(irr_bin) %>%
mutate(total = n()) %>%
group_by(cluster, irr_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(officer) %>%
mutate(total = n()) %>%
group_by(cluster, officer) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(officer, cluster, percent, cluster_percent, difference) %>%
group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(end_onet55) %>%
mutate(total = n()) %>% filter(total > 10) %>%
group_by(cluster, end_onet55) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_military_enlist_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_military_enlist_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_entered_workforce_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_entered_workforce_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year entered the workforce")
# Visualizing 20 years 20 cluster solution
seq.20cluster_20$stats
cluster_20 <- seq.20cluster_20$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.20)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)
all_df <- left_join(all_df, covariates, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
# Plotting
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(gender) %>%
mutate(total = n()) %>%
group_by(cluster, gender) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(gender, cluster, percent, cluster_percent, difference) %>%
group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_forprofit) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_forprofit) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(highest_degree_from_hbcu) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree_from_hbcu) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(irr_bin) %>%
mutate(total = n()) %>%
group_by(cluster, irr_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(officer) %>%
mutate(total = n()) %>%
group_by(cluster, officer) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(officer, cluster, percent, cluster_percent, difference) %>%
group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(end_onet55) %>%
mutate(total = n()) %>% filter(total > 10) %>%
group_by(cluster, end_onet55) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_military_enlist_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_military_enlist_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
group_by(year_entered_workforce_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_entered_workforce_bin) %>%
mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) +
geom_tile()+
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
theme_classic()+
labs(title = "Cluster membership by year entered the workforce")
vet.seq <- create_seq_vet_fixed("vet.seq", 10, left_unemp = TRUE)
transition_matrix <- seqtrate(vet.seq, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(vet.seq, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(vet.seq, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(vet.seq, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt1, k = number)
assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt2, k = number)
assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
clusterpam <- wcKMedoids(sequencing.opt3, k = number)
assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)
# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)
rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)
rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)
PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
#facet_grid(. ~ group, scales = "free") +
geom_tile() +
scale_fill_continuous(name = "ASW value", type = "viridis") +
labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
ylab("number of clusters") +
ylim(1,21) +
theme_bw()
clusterpam <- wcKMedoids(sequencing.opt2, k = 20)
clusterpam$stats
clusterpam_20 <- clusterpam$clustering
clusterpam_20_fac <- factor(clusterpam_20)#, #labels = paste("Type", 1:20))
seqdplot(vet.seq, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
seqlegend(vet.seq, cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
vet <- rownames(vet.seq)
vet_df <- as.data.frame(cbind(vet, clusterpam_20))
colnames(vet_df) <- c("id", "cluster")
bg_covariates$id <- as.character(bg_covariates$id)
vet_df <- left_join(vet_df, bg_covariates, by = "id")
degree <- vet_df %>%
filter(!is.na(highest_degree)) %>%
group_by(highest_degree) %>%
mutate(total = n()) %>%
group_by(cluster, highest_degree) %>%
mutate(percent = n()/total) %>% distinct(highest_degree, cluster, percent) %>%
group_by(highest_degree) %>% mutate(sum = sum(percent))
degree$cluster <- as.factor(degree$cluster)
degree$highest_degree <- factor(degree$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
degree <- degree %>% filter(!is.na(highest_degree))
degree <- as.data.frame(degree)
ggplot(degree, aes(cluster, highest_degree, fill= percent)) +
geom_tile()+
scale_fill_gradient(low="white", high="blue")+
theme_classic()+
labs(title = "Clustering X Degree --before normalization")
year_entered_workforce_bin <- vet_df %>%
filter(!is.na(year_entered_workforce_bin)) %>%
group_by(year_entered_workforce_bin) %>%
mutate(total = n()) %>%
group_by(cluster, year_entered_workforce_bin) %>%
mutate(percent = n()/total) %>% distinct(cluster, year_entered_workforce_bin, percent) %>%
group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent))
year_entered_workforce_bin$cluster <- as.factor(year_entered_workforce_bin$cluster)
#irr_bin$highest_degree <- factor(irr_bin$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
#degree <- degree %>% filter(!is.na(highest_degree))
year_entered_workforce_bin <- as.data.frame(year_entered_workforce_bin)
seqdplot(vet.seq, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(year_entered_workforce_bin, aes(cluster, year_entered_workforce_bin, fill= percent)) +
geom_tile()+
scale_fill_gradient(low="white", high="blue")+
theme_classic()+
labs(title = "Clustering X Degree --before normalization")
transition_matrix <- seqtrate(seq.20, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
for (j in 1:ncol(transition_matrix)){
transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
}
}
for(j in 1:ncol(transition_matrix)){
transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)
sequencing.opt1 <- seqdist(seq.10, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
clusterpam <- wcKMedoids(sequencing.opt1, k = 20)
clusterpam$stats
clusterpam_20 <- clusterpam$clustering
clusterpam_20_fac <- factor(clusterpam_20)#, #labels = paste("Type", 1:20))
#seqdplot(seq.20, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
# cpal = c(uva_color_palette[1], uva_color_palette[2],
# uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
# uva_color_palette[5], "#CBBEB5", "dark gray"))
#seqlegend(seq.10, cpal = c(uva_color_palette[1], uva_color_palette[2],
# uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
# uva_color_palette[5], "#CBBEB5", "dark gray"))
all <- rownames(seq.20)
all_df <- as.data.frame(cbind(all, clusterpam_20))
colnames(all_df) <- c("id", "cluster")
bg_covariates$id <- as.character(bg_covariates$id)
all_df <- left_join(all_df, bg_covariates, by = "id")
veteran <- all_df %>%
filter(!is.na(veteran)) %>%
group_by(veteran) %>%
mutate(total = n()) %>%
group_by(cluster, veteran) %>%
mutate(percent = n()/total) %>% distinct(veteran, cluster, percent) %>%
group_by(veteran) %>% mutate(sum = sum(percent))
veteran$cluster <- as.factor(veteran$cluster)
#degree$veteran <- factor(degree$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
veteran <- veteran %>% filter(!is.na(veteran))
veteran <- as.data.frame(veteran)
seqdplot(seq.20, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
cpal = c(uva_color_palette[1], uva_color_palette[2],
uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= percent)) +
geom_tile()+
scale_fill_gradient(low="white", high="blue")+
theme_classic()+
labs(title = "Clustering X Degree --before normalization")